# Import required R libraries
#library(AppliedPredictiveModeling)
library(caret)
library(tidyverse)
#library(pls)
#library(elasticnet)
#library(corrplot)
library(readxl)
library(writexl)
library(naniar)
library(corrplot)

# This is role playing. I am your new boss. I am in charge of production at
# ABC Beverage and you are a team of data scientists reporting to me. My
# leadership has told me that new regulations require us to understand our
# manufacturing process and its predictive factors, and to be able to report
# to them our predictive model of pH.
# Please use the historical data set I am providing. Build and report the
# factors in BOTH a technical and a non-technical report. I like to use Word
# and Excel. Please provide your non-technical report in a business-friendly,
# readable document and your predictions in an Excel-readable format. The
# technical report should clearly show the models you tested and how you
# selected your final approach.
# Please submit both RPubs links and .rmd files (or other readable formats)
# for the technical and non-technical reports. Also submit the Excel file
# showing the predictions of your models for pH.
# Load the historical beverage data set from the Excel workbook
bev_data_raw <- read_excel(path = "data/StudentData.xlsx")
# Dimensions of the raw data: 2571 observations (rows) by 33 columns
bev_data_raw %>%
  dim()
## [1] 2571 33

# Type and leading values of every column
bev_data_raw %>%
  str()
## tibble [2,571 × 33] (S3: tbl_df/tbl/data.frame)
## $ Brand Code : chr [1:2571] "B" "A" "B" "A" ...
## $ Carb Volume : num [1:2571] 5.34 5.43 5.29 5.44 5.49 ...
## $ Fill Ounces : num [1:2571] 24 24 24.1 24 24.3 ...
## $ PC Volume : num [1:2571] 0.263 0.239 0.263 0.293 0.111 ...
## $ Carb Pressure : num [1:2571] 68.2 68.4 70.8 63 67.2 66.6 64.2 67.6 64.2 72 ...
## $ Carb Temp : num [1:2571] 141 140 145 133 137 ...
## $ PSC : num [1:2571] 0.104 0.124 0.09 NA 0.026 0.09 0.128 0.154 0.132 0.014 ...
## $ PSC Fill : num [1:2571] 0.26 0.22 0.34 0.42 0.16 ...
## $ PSC CO2 : num [1:2571] 0.04 0.04 0.16 0.04 0.12 ...
## $ Mnf Flow : num [1:2571] -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 ...
## $ Carb Pressure1 : num [1:2571] 119 122 120 115 118 ...
## $ Fill Pressure : num [1:2571] 46 46 46 46.4 45.8 45.6 51.8 46.8 46 45.2 ...
## $ Hyd Pressure1 : num [1:2571] 0 0 0 0 0 0 0 0 0 0 ...
## $ Hyd Pressure2 : num [1:2571] NA NA NA 0 0 0 0 0 0 0 ...
## $ Hyd Pressure3 : num [1:2571] NA NA NA 0 0 0 0 0 0 0 ...
## $ Hyd Pressure4 : num [1:2571] 118 106 82 92 92 116 124 132 90 108 ...
## $ Filler Level : num [1:2571] 121 119 120 118 119 ...
## $ Filler Speed : num [1:2571] 4002 3986 4020 4012 4010 ...
## $ Temperature : num [1:2571] 66 67.6 67 65.6 65.6 66.2 65.8 65.2 65.4 66.6 ...
## $ Usage cont : num [1:2571] 16.2 19.9 17.8 17.4 17.7 ...
## $ Carb Flow : num [1:2571] 2932 3144 2914 3062 3054 ...
## $ Density : num [1:2571] 0.88 0.92 1.58 1.54 1.54 1.52 0.84 0.84 0.9 0.9 ...
## $ MFR : num [1:2571] 725 727 735 731 723 ...
## $ Balling : num [1:2571] 1.4 1.5 3.14 3.04 3.04 ...
## $ Pressure Vacuum : num [1:2571] -4 -4 -3.8 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 ...
## $ PH : num [1:2571] 8.36 8.26 8.94 8.24 8.26 8.32 8.4 8.38 8.38 8.5 ...
## $ Oxygen Filler : num [1:2571] 0.022 0.026 0.024 0.03 0.03 0.024 0.066 0.046 0.064 0.022 ...
## $ Bowl Setpoint : num [1:2571] 120 120 120 120 120 120 120 120 120 120 ...
## $ Pressure Setpoint: num [1:2571] 46.4 46.8 46.6 46 46 46 46 46 46 46 ...
## $ Air Pressurer : num [1:2571] 143 143 142 146 146 ...
## $ Alch Rel : num [1:2571] 6.58 6.56 7.66 7.14 7.14 7.16 6.54 6.52 6.52 6.54 ...
## $ Carb Rel : num [1:2571] 5.32 5.3 5.84 5.42 5.44 5.44 5.38 5.34 5.34 5.34 ...
## $ Balling Lvl : num [1:2571] 1.48 1.56 3.28 3.04 3.04 3.02 1.44 1.44 1.44 1.38 ...
# Column types: `Brand Code` is the only character column, all remaining
# columns are numeric, and PH is the result (response) column.
# Per-column summary statistics, including NA counts for the numeric columns
bev_data_raw %>%
  summary()
## Brand Code Carb Volume Fill Ounces PC Volume
## Length:2571 Min. :5.040 Min. :23.63 Min. :0.07933
## Class :character 1st Qu.:5.293 1st Qu.:23.92 1st Qu.:0.23917
## Mode :character Median :5.347 Median :23.97 Median :0.27133
## Mean :5.370 Mean :23.97 Mean :0.27712
## 3rd Qu.:5.453 3rd Qu.:24.03 3rd Qu.:0.31200
## Max. :5.700 Max. :24.32 Max. :0.47800
## NA's :10 NA's :38 NA's :39
## Carb Pressure Carb Temp PSC PSC Fill
## Min. :57.00 Min. :128.6 Min. :0.00200 Min. :0.0000
## 1st Qu.:65.60 1st Qu.:138.4 1st Qu.:0.04800 1st Qu.:0.1000
## Median :68.20 Median :140.8 Median :0.07600 Median :0.1800
## Mean :68.19 Mean :141.1 Mean :0.08457 Mean :0.1954
## 3rd Qu.:70.60 3rd Qu.:143.8 3rd Qu.:0.11200 3rd Qu.:0.2600
## Max. :79.40 Max. :154.0 Max. :0.27000 Max. :0.6200
## NA's :27 NA's :26 NA's :33 NA's :23
## PSC CO2 Mnf Flow Carb Pressure1 Fill Pressure
## Min. :0.00000 Min. :-100.20 Min. :105.6 Min. :34.60
## 1st Qu.:0.02000 1st Qu.:-100.00 1st Qu.:119.0 1st Qu.:46.00
## Median :0.04000 Median : 65.20 Median :123.2 Median :46.40
## Mean :0.05641 Mean : 24.57 Mean :122.6 Mean :47.92
## 3rd Qu.:0.08000 3rd Qu.: 140.80 3rd Qu.:125.4 3rd Qu.:50.00
## Max. :0.24000 Max. : 229.40 Max. :140.2 Max. :60.40
## NA's :39 NA's :2 NA's :32 NA's :22
## Hyd Pressure1 Hyd Pressure2 Hyd Pressure3 Hyd Pressure4
## Min. :-0.80 Min. : 0.00 Min. :-1.20 Min. : 52.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 86.00
## Median :11.40 Median :28.60 Median :27.60 Median : 96.00
## Mean :12.44 Mean :20.96 Mean :20.46 Mean : 96.29
## 3rd Qu.:20.20 3rd Qu.:34.60 3rd Qu.:33.40 3rd Qu.:102.00
## Max. :58.00 Max. :59.40 Max. :50.00 Max. :142.00
## NA's :11 NA's :15 NA's :15 NA's :30
## Filler Level Filler Speed Temperature Usage cont Carb Flow
## Min. : 55.8 Min. : 998 Min. :63.60 Min. :12.08 Min. : 26
## 1st Qu.: 98.3 1st Qu.:3888 1st Qu.:65.20 1st Qu.:18.36 1st Qu.:1144
## Median :118.4 Median :3982 Median :65.60 Median :21.79 Median :3028
## Mean :109.3 Mean :3687 Mean :65.97 Mean :20.99 Mean :2468
## 3rd Qu.:120.0 3rd Qu.:3998 3rd Qu.:66.40 3rd Qu.:23.75 3rd Qu.:3186
## Max. :161.2 Max. :4030 Max. :76.20 Max. :25.90 Max. :5104
## NA's :20 NA's :57 NA's :14 NA's :5 NA's :2
## Density MFR Balling Pressure Vacuum
## Min. :0.240 Min. : 31.4 Min. :-0.170 Min. :-6.600
## 1st Qu.:0.900 1st Qu.:706.3 1st Qu.: 1.496 1st Qu.:-5.600
## Median :0.980 Median :724.0 Median : 1.648 Median :-5.400
## Mean :1.174 Mean :704.0 Mean : 2.198 Mean :-5.216
## 3rd Qu.:1.620 3rd Qu.:731.0 3rd Qu.: 3.292 3rd Qu.:-5.000
## Max. :1.920 Max. :868.6 Max. : 4.012 Max. :-3.600
## NA's :1 NA's :212 NA's :1
## PH Oxygen Filler Bowl Setpoint Pressure Setpoint
## Min. :7.880 Min. :0.00240 Min. : 70.0 Min. :44.00
## 1st Qu.:8.440 1st Qu.:0.02200 1st Qu.:100.0 1st Qu.:46.00
## Median :8.540 Median :0.03340 Median :120.0 Median :46.00
## Mean :8.546 Mean :0.04684 Mean :109.3 Mean :47.62
## 3rd Qu.:8.680 3rd Qu.:0.06000 3rd Qu.:120.0 3rd Qu.:50.00
## Max. :9.360 Max. :0.40000 Max. :140.0 Max. :52.00
## NA's :4 NA's :12 NA's :2 NA's :12
## Air Pressurer Alch Rel Carb Rel Balling Lvl
## Min. :140.8 Min. :5.280 Min. :4.960 Min. :0.00
## 1st Qu.:142.2 1st Qu.:6.540 1st Qu.:5.340 1st Qu.:1.38
## Median :142.6 Median :6.560 Median :5.400 Median :1.48
## Mean :142.8 Mean :6.897 Mean :5.437 Mean :2.05
## 3rd Qu.:143.0 3rd Qu.:7.240 3rd Qu.:5.540 3rd Qu.:3.14
## Max. :148.2 Max. :8.620 Max. :6.060 Max. :3.66
## NA's :9 NA's :10 NA's :1
# Plot the pattern of missing values across all columns
bev_data_raw %>%
  vis_miss()
# Near Zero Variance Columns
# Positions of the columns that caret flags as having near-zero variance
nzv_cols <- nearZeroVar(bev_data_raw)
length(nzv_cols)
## [1] 1
nzv_cols
## [1] 13
# Apparently just column 13 (Hyd Pressure1)
# Index with nzv_cols rather than a hard-coded 13 so the column(s) shown are
# always the ones nearZeroVar() reported, even if the column order changes.
bev_data_raw[nzv_cols]
## # A tibble: 2,571 × 1
## `Hyd Pressure1`
## <dbl>
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## # … with 2,561 more rows
# Correlation matrix of the numeric columns, computed on complete rows.
# `Brand Code` is removed BEFORE drop_na() so that rows are not discarded
# because of a missing value in a column the correlation never uses.
corr <- bev_data_raw %>%
  select(-`Brand Code`) %>%
  drop_na() %>%
  cor()
# Shrink the coefficient and axis-label text: at the default size the numbers
# for this many variables overlap and the plot is unreadable.
corrplot(corr, method = "number", number.cex = 0.5, tl.cex = 0.6)
# Feature plot for the numeric predictor variables against the result variable PH
# Names of the numeric predictors: every column except the character column
# `Brand Code` and the result column PH
cols <- setdiff(colnames(bev_data_raw), c("Brand Code", "PH"))
# Alternative featurePlot views, left disabled:
#featurePlot(bev_data_raw[,cols], bev_data_raw$PH, "ellipse")
#featurePlot(bev_data_raw[,cols], bev_data_raw$PH, "strip", jitter = TRUE)
#featurePlot(bev_data_raw[,cols], bev_data_raw$PH, "box")
#featurePlot(bev_data_raw[,cols], bev_data_raw$PH, "pairs")
# Scatter plot of each predictor against PH, drawn as points plus a smoothed
# trend (span = .5), laid out one panel per page
featurePlot(
  x = bev_data_raw[, cols],
  y = bev_data_raw$PH,
  plot = "scatter",
  type = c("p", "smooth"),
  span = .5,
  layout = c(1, 1)
)
#30#
# Shapiro-Wilk normality test applied to each predictor column in turn
bev_data_raw[, cols] %>%
  apply(MARGIN = 2, FUN = shapiro.test)
## $`Carb Volume`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.96797, p-value < 2.2e-16
##
##
## $`Fill Ounces`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.99317, p-value = 1.622e-09
##
##
## $`PC Volume`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.98309, p-value < 2.2e-16
##
##
## $`Carb Pressure`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.99681, p-value = 3.582e-05
##
##
## $`Carb Temp`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.99469, p-value = 6.316e-08
##
##
## $PSC
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.95337, p-value < 2.2e-16
##
##
## $`PSC Fill`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.9407, p-value < 2.2e-16
##
##
## $`PSC CO2`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.83089, p-value < 2.2e-16
##
##
## $`Mnf Flow`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.74864, p-value < 2.2e-16
##
##
## $`Carb Pressure1`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.99065, p-value = 8.597e-12
##
##
## $`Fill Pressure`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.91452, p-value < 2.2e-16
##
##
## $`Hyd Pressure1`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.88303, p-value < 2.2e-16
##
##
## $`Hyd Pressure2`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.80674, p-value < 2.2e-16
##
##
## $`Hyd Pressure3`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.81729, p-value < 2.2e-16
##
##
## $`Hyd Pressure4`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.95761, p-value < 2.2e-16
##
##
## $`Filler Level`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.86234, p-value < 2.2e-16
##
##
## $`Filler Speed`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.43993, p-value < 2.2e-16
##
##
## $Temperature
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.82493, p-value < 2.2e-16
##
##
## $`Usage cont`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.87732, p-value < 2.2e-16
##
##
## $`Carb Flow`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.75487, p-value < 2.2e-16
##
##
## $Density
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.83495, p-value < 2.2e-16
##
##
## $MFR
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.44664, p-value < 2.2e-16
##
##
## $Balling
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.79293, p-value < 2.2e-16
##
##
## $`Pressure Vacuum`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.96387, p-value < 2.2e-16
##
##
## $`Oxygen Filler`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.75741, p-value < 2.2e-16
##
##
## $`Bowl Setpoint`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.80247, p-value < 2.2e-16
##
##
## $`Pressure Setpoint`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.73661, p-value < 2.2e-16
##
##
## $`Air Pressurer`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.70652, p-value < 2.2e-16
##
##
## $`Alch Rel`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.719, p-value < 2.2e-16
##
##
## $`Carb Rel`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.94542, p-value < 2.2e-16
##
##
## $`Balling Lvl`
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.72188, p-value < 2.2e-16
# Number of observations recorded for each brand code
ggplot(data = bev_data_raw) +
  geom_bar(mapping = aes(x = `Brand Code`))

# Distribution of PH within each brand code.
# This is a separate statement: the two plots were previously fused on one
# line, which does not parse.
ggplot(data = bev_data_raw, mapping = aes(x = `Brand Code`, y = PH)) +
  geom_boxplot()